/*
 Copyright 2014-Present Algorithms in Motion LLC
 
 This file is part of FDTD++.
 
 FDTD++ is proprietary software: you can use it and/or modify it
 under the terms of the Algorithms in Motion License as published by
 Algorithms in Motion LLC, either version 1 of the License, or (at your
 option) any later version.
 
 FDTD++ is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 Algorithms in Motion License for more details.
 
 You should have received a copy of the Algorithms in Motion License
 along with FDTD++. If not, see <http://www.aimotionllc.com/licenses/>.
*/
// CREATED    : 9/8/2015
// LAST UPDATE: 9/30/2015

#include "statsxx/machine_learning/restricted_Boltzmann_machine/RBM.hpp"

// STL
#include <cmath>     // std::log()
#include <limits>    // std::numeric_limits<>::max()
#include <tuple>     // std::tuple<>, std::make_tuple(), std::tie()

// jScience
#include "jScience/linalg.hpp"  // Matrix<>, Vector<>, transpose()

#include "jrandnum.hpp"         // rand_num_uniform_Mersenne_twister()

// stats++
#include "statsxx/machine_learning/activation_functions.hpp" // activation_function::Logistic


// Basic algorithm for randomly searching for initial weights and biases:
// conduct ntrials random trials, keep the weights/biases with the lowest
// mean reconstruction error over the training set.
//
// Parameters:
//   ntrials    - number of random trials to conduct
//   X          - training data, one example per row
//                (X.size(0) examples, X.size(1) == nvis inputs each)
//   x_binomial - whether the data is binomial (forwarded to reconstruct())
//
// Returns: the best (lowest) mean reconstruction error found; on return,
// this->W, this->b, this->c hold the corresponding parameters.
//
// note: previously declared `inline`; removed because an inline definition
// confined to this translation unit is not visible to external callers and
// causes link errors (inline functions must be defined in every TU that
// uses them).
double restricted_Boltzmann_machine::RBM::initialize_weights(
                                                             const int ntrials,        // number of trials to conduct
                                                             const Matrix<double> &X,  // data
                                                             const bool x_binomial     // whether the data is binomial
                                                             )
{
    // nothing to try: leave the current parameters untouched instead of
    // clobbering this->W/b/c with empty (default-constructed) matrices
    if(ntrials <= 0)
        return std::numeric_limits<double>::max();
    
    Matrix<double> W_best;  // best weight matrix
    Vector<double> b_best;  // best set of visible neuron biases
    Vector<double> c_best;  // best set of hidden neuron biases
    
    // note: we need the mean of each input over the entire training set
    Vector<double> x_mean(X.size(1), 0.0);
    
    // << jmm: I really need to define the mean() functions, etc. for Vector<> ... then you could call mean() function using each column of X >>
    for(auto i = 0; i < X.size(0); ++i)
    {
        for(auto j = 0; j < X.size(1); ++j)
        {
            x_mean(j) += X(i,j);
        }
    }
    
    x_mean /= X.size(0);
    
    // set the best error found to the worst possible scenario
    double best_error = std::numeric_limits<double>::max();
    
    // loop-invariant: one activation object serves every trial (was
    // re-constructed inside the trial loop)
    activation_function::Logistic logistic;
    
    for(int i = 0; i < ntrials; ++i)
    {
        // DETERMINE A TRIAL SET OF WEIGHTS AND BIASES
        
        // randomly sample the entire weight matrix
        this->W = Matrix<double>(this->nhid, this->nvis);
        
        // note: a range of variations are used, one for each trial;
        // cast before multiplying so nvis*nhid cannot overflow int
        double variation = 4.0*rand_num_uniform_Mersenne_twister(0.0, 1.0)/std::sqrt(std::sqrt(static_cast<double>(this->nvis)*static_cast<double>(this->nhid)));
        
        for(auto j = 0; j < this->W.size(0); ++j)
        {
            for(auto k = 0; k < this->W.size(1); ++k)
            {
                this->W(j,k) = variation*rand_num_uniform_Mersenne_twister(-0.5, 0.5);
            }
        }
        
        // compute the hidden biases such that the net input to it is 0 for an average training case
        this->c = -1.0*this->W*x_mean;
        
        // compute the visible biases such that the probabilities of the reconstructed neurons are as close as possible to the inputs observed in the training set, for average inputs and hidden activations
        Vector<double> p_mean(nhid, 0.0);
        
        for(auto j = 0; j < X.size(0); ++j)
        {
            // mean hidden activation for an average pass over the data
            Vector<double> p = logistic.f(this->c + this->W*X.row(j));
            
            p_mean += p;
        }
        
        p_mean /= X.size(0);
        
        this->b = Vector<double>(this->nvis);
        
        Vector<double> WT_p_mean = transpose(W)*p_mean;
        
        for(auto j = 0; j < this->b.size(); ++j)
        {
            // logit of the empirical mean, corrected by the mean hidden drive
            // NOTE(review): yields +/-inf if x_mean(j) is exactly 0 or 1
            // (e.g. an input that is constant across the training set) --
            // confirm inputs are strictly in (0,1) or clamp upstream
            this->b(j) = std::log(x_mean(j)/(1.0 - x_mean(j))) - WT_p_mean(j);
        }
        
        
        // COMPUTE RECONSTRUCTION ERROR FOR THESE TRIAL WEIGHTS AND BIASES
        double error = 0.0;
        
        for(auto j = 0; j < X.size(0); ++j)
        {
            Vector<double> v;
            Vector<double> h;
            double error_j;
            std::tie(v, h, error_j) = reconstruct(X.row(j), x_binomial);
            
            error += error_j;
        }
        
        // note: we normalize the error simply for reporting (so one can compare to stochastic_gradient(), etc.)
        error /= X.size(0);
        
        // COMPARE ERROR TO OUR BEST, AND POSSIBLY UPDATE PARAMETERS
        if(error < best_error)
        {
            W_best = this->W;
            b_best = this->b;
            c_best = this->c;
            
            best_error = error;
        }
    }
    
    // commit the best trial found
    this->W = W_best;
    this->b = b_best;
    this->c = c_best;
    
    return best_error;
}

